The Journal of Open Source Software (JOSS)

Bringing open source software practices to the scholarly publishing community for authors, reviewers, editors, and publishers

Published

July 16, 2024

Introduction

This report contains the code for generating figures and numbers presented in

Diehl et al (2024): The Journal of Open Source Software (JOSS) - Bringing open source software practices to the scholarly publishing community for authors, reviewers, editors, and publishers

Load packages

We first load the R packages that are needed for the analysis and plotting.

Code
## Attach every package required for the analysis and plotting, silencing
## startup messages. The attach order matches the original call sequence so
## that function masking on the search path is unchanged. local() keeps the
## helper variables out of the global environment.
suppressPackageStartupMessages(local({
    pkgs <- c("ggplot2", "dplyr", "tidyr", "cowplot", "ggridges",
              "lubridate", "kableExtra", "patchwork", "purrr", "gh",
              "openalexR")
    for (pkg in pkgs) {
        library(pkg, character.only = TRUE)
    }
}))

Read data

Next, we download the precompiled dataset underlying https://www.theoj.org/joss-analytics/joss-submission-analytics.html.

Code
## Refresh the cached data only when the report is rendered with the
## `pull_data` parameter set to TRUE; otherwise reuse the local copy.
if (params$pull_data) {
    ## The .rds file is gzip-compressed, hence the gzcon() wrapper around the
    ## URL connection; readRDS() opens and closes the connection itself
    ## (see ?readRDS).
    papers <- readRDS(gzcon(url("https://github.com/openjournals/joss-analytics/blob/gh-pages/joss_submission_analytics.rds?raw=true")))
    saveRDS(papers, file = "papers.rds")
}
papers <- readRDS("papers.rds")

Create summary tables

For easier plotting later, we create several summary tables.

Code
## Generate publication month and year, remove two retracted papers
papers <- papers |>
    dplyr::mutate(pubmonth = lubridate::floor_date(published.date, "month"),
                  pubyear = factor(lubridate::year(published.date))) |>
    dplyr::filter(api_state != "retracted")

## Monthly summary - number of papers
summarydf_month <- papers |>
    dplyr::group_by(pubmonth) |>
    dplyr::summarize(npub = n(), 
                     pubyear = unique(lubridate::year(published.date)),
                     .groups = "drop")

## Yearly summary - number of papers, editors and reviewers
## Built by full-joining per-year counts of papers, editors, first-time
## editors, reviewers/reviews, and first-time reviewers. The editor and
## reviewers columns hold comma-separated lists, expanded one name per row
## with separate_longer_delim().
## NOTE(review): separate_longer_delim() does not trim whitespace around the
## delimiter -- assumes the lists contain no spaces after commas; confirm
## against the upstream data format.
summarydf_year <- papers |>
    dplyr::group_by(pubyear) |>
    dplyr::summarize(npub = n(), 
                     .groups = "drop") |>
    dplyr::full_join(
        ## editors: unique handling editors per year
        papers |> 
            dplyr::select(pubyear, editor) |>
            tidyr::separate_longer_delim(editor, delim = ",") |>
            dplyr::group_by(pubyear) |>
            dplyr::summarize(nbr_editors = length(unique(editor)), 
                             .groups = "drop"),
        by = join_by(pubyear)
    ) |>
    dplyr::full_join(
        ## new editors: sort by year, then keep only each editor's first
        ## appearance, so the per-year count is the number of editors whose
        ## first accepted paper falls in that year
        papers |> 
            dplyr::select(pubyear, editor) |>
            tidyr::separate_longer_delim(editor, delim = ",") |>
            dplyr::arrange(pubyear) |>
            dplyr::filter(!duplicated(editor)) |>
            dplyr::group_by(pubyear) |>
            dplyr::summarize(nbr_new_editors = length(unique(editor)), 
                             .groups = "drop") ,
        by = join_by(pubyear)
    ) |>
    dplyr::full_join(
        ## reviewers: unique reviewers per year, plus the total number of
        ## review instances (length(), not de-duplicated)
        papers |> 
            dplyr::select(pubyear, reviewers) |>
            tidyr::separate_longer_delim(reviewers, delim = ",") |>
            dplyr::group_by(pubyear) |>
            dplyr::summarize(nbr_reviewers = length(unique(reviewers)),
                             nbr_reviews = length(reviewers),
                             .groups = "drop"),
        by = join_by(pubyear)
    ) |>
    dplyr::full_join(
        ## new reviewers: same first-appearance logic as for new editors
        papers |> 
            dplyr::select(pubyear, reviewers) |>
            tidyr::separate_longer_delim(reviewers, delim = ",") |>
            dplyr::arrange(pubyear) |> 
            dplyr::filter(!duplicated(reviewers)) |>
            dplyr::group_by(pubyear) |>
            dplyr::summarize(nbr_new_reviewers = length(unique(reviewers)),
                             .groups = "drop"),
        by = join_by(pubyear)
    ) |>
    ## Express first-timers as a formatted percentage of that year's totals
    dplyr::mutate(frac_new_editors = scales::percent(nbr_new_editors/nbr_editors, 
                                                     accuracy = 0.01),
                  frac_new_reviewers = scales::percent(nbr_new_reviewers/nbr_reviewers,
                                                       accuracy = 0.01))

## Total reviewer/editor counts
## Overall number of distinct editors across all years
tot_nbr_editors <- papers |>
    dplyr::select(editor) |>
    tidyr::separate_longer_delim(editor, delim = ",") |>
    dplyr::pull(editor) |>
    unique() |>
    length()

## Overall number of distinct reviewers across all years
tot_nbr_reviewers <- papers |>
    dplyr::select(reviewers) |>
    tidyr::separate_longer_delim(reviewers, delim = ",") |>
    dplyr::pull(reviewers) |>
    unique() |>
    length()

Monthly summary

Code
## Interactive, filterable table of the monthly publication summary
DT::datatable(summarydf_month,
              rownames = FALSE,
              escape = FALSE,
              filter = list(position = "top", clear = FALSE),
              options = list(scrollX = TRUE))

Yearly summary

Code
## Interactive, filterable table of the yearly publication summary
DT::datatable(summarydf_year,
              rownames = FALSE,
              escape = FALSE,
              filter = list(position = "top", clear = FALSE),
              options = list(scrollX = TRUE))

Define color palette

Code
## Years
yearcols <- c("#FF0066FF", "#328C97FF", "#D1AAC2FF", "#B3E0BFFF",
              "#DB7003FF", "#F8C1A6FF", "#A30000FF", "#97D1D9FF", "#916C37FF")
names(yearcols) <- as.character(2016:2024)

Number of papers published per month

The figure below shows the number of papers published each month. The overlaid curve represents a loess fit to the monthly data, generated using the ggplot2 package.

Code
## Monthly publication counts as year-colored bars with a loess trend line.
## pubmonth is mapped as a factor, so the loess fit runs on the numeric
## factor codes (a consecutive month index) rather than on the dates
## themselves.
(gg_nbrpapers <- ggplot(summarydf_month, 
                        aes(x = factor(pubmonth), y = npub)) + 
     geom_col(aes(fill = as.character(pubyear))) + 
     geom_smooth(aes(x = as.numeric(factor(pubmonth))), method = "loess", 
                 se = FALSE, color = "black") + 
     labs(x = "Publication month", y = "Number of published\npapers per month") + 
     scale_y_continuous(expand = c(0, 0)) + 
     scale_fill_manual(name = "Year", values = yearcols) + 
     guides(fill = guide_legend(nrow = 1, byrow = TRUE, 
                                title = "Publication year")) + 
     theme_cowplot() + 
     ## Month tick labels are hidden (too many to read); the year is conveyed
     ## by the fill legend at the bottom instead
     theme(axis.title = element_text(size = 15),
           axis.text.x = element_blank(),
           axis.ticks.x = element_blank(), 
           legend.position = "bottom",
           legend.justification = "right",
           legend.margin = margin(1, 10, 1, 1)))

Number of editors per year

The next figure shows the number of editors that accept at least one paper in a given year, as well as the total number of editors that have accepted at least one paper overall. Note that the data from 2024 only include papers published until 2024-07-11.

Code
## Unique handling editors per publication year, with the overall editor
## total annotated inside the panel (anchored at the first year, at 90% of
## the tallest bar).
(gg_nbreditors <- ggplot(summarydf_year,
                         aes(x = pubyear, y = nbr_editors, fill = pubyear)) + 
     scale_fill_manual(name = "Year", values = yearcols) + 
     geom_col() +
     annotate(geom = "text", x = 1, y = 0.9 * max(summarydf_year$nbr_editors), 
              label = paste0("Total number\nof editors: ", tot_nbr_editors), 
              hjust = 0, vjust = 1, size = 5) + 
     scale_y_continuous(expand = c(0, 0)) + 
     labs(x = "Publication year", y = "Number of editors") + 
     theme_cowplot() + 
     theme(legend.position = "none"))

Number of new editors per year

We next illustrate the number of editors that accept their first paper in a given year, as well as what fraction this represents of the total number of editors accepting a paper that year. Note that the data from 2024 only include papers published until 2024-07-11.

Code
## First-time editors per year; each bar is labelled with the share these
## represent of all editors active that year (frac_new_editors).
ggplot(summarydf_year,
       aes(x = pubyear, y = nbr_new_editors, fill = pubyear)) + 
    scale_fill_manual(name = "Year", values = yearcols) + 
    geom_col() +
    geom_text(aes(label = frac_new_editors), vjust = -0.2) + 
    ## Extra headroom (10%) at the top so the percentage labels are not
    ## clipped
    scale_y_continuous(expand = expansion(mult = c(0, .1))) + 
    labs(x = "", y = "Number of new editors and\npercentage of total number of editors") + 
    theme_cowplot() + 
    theme(legend.position = "none")

Code
## Combine - show both the total number of editors and the number of new ones
## Semi-transparent bars: all editors active in the year; solid bars:
## first-time editors; text labels: first-timers as a share of the total.
ggplot(summarydf_year,
       aes(x = pubyear, fill = pubyear)) + 
    scale_fill_manual(name = "Year", values = yearcols) + 
    geom_col(aes(y = nbr_editors), alpha = 0.25) +
    geom_col(aes(y = nbr_new_editors)) + 
    geom_text(aes(y = nbr_new_editors, label = frac_new_editors), vjust = -0.2) + 
    scale_y_continuous(expand = c(0, 0)) + 
    labs(x = "", y = "Number of new editors and\npercentage of total number of editors") + 
    theme_cowplot() + 
    theme(legend.position = "none")

Number of reviewers per year

Similarly to the number of editors above, the figure below shows the number of reviewers reviewing at least one paper in a given year, as well as the total number of reviews submitted in a year. Also here, the data from 2024 only include papers published until 2024-07-11.

Code
## Bars: unique reviewers per year; grey line/points: total reviews per year.
## NOTE(review): geom_col() is added twice. The first copy makes the factor
## pubyear the first layer, so the x scale is trained as discrete before the
## line/point layers (which map numeric x positions); the second copy redraws
## the bars on top of the grey line. Confirm this duplication is intentional
## before simplifying -- removing either copy changes scale setup or layer
## stacking.
(gg_nbrreviewers <- ggplot(summarydf_year,
                           aes(x = pubyear, y = nbr_reviewers)) + 
        scale_fill_manual(name = "Year", values = yearcols) + 
        geom_col(aes(fill = pubyear)) +
        geom_line(aes(x = as.numeric(pubyear), y = nbr_reviews), color = "grey",
                  linewidth = 1.5) + 
        geom_point(aes(x = as.numeric(pubyear), y = nbr_reviews), color = "grey",
                   size = 2.5) + 
        geom_col(aes(fill = pubyear)) +
        annotate(geom = "text", x = 1, y = 0.9 * max(summarydf_year$nbr_reviews), 
                 label = paste0("Total number\nof reviewers: ", tot_nbr_reviewers), 
                 hjust = 0, vjust = 1, size = 5) + 
        scale_y_continuous(expand = expansion(mult = c(0, .05))) + 
        labs(x = "Publication year", y = "Number of reviewers\nand reviews", 
             caption = "Bars show number of unique reviewers,\ngrey line shows total number of reviews") + 
        theme_cowplot() + 
        theme(legend.position = "none"))

Number of ‘new’ reviewers per year

We also plot the number of reviewers reviewing their first paper in a given year, and calculate what fraction of the total number of reviewers that year that this represents. As above, the data from 2024 only include papers published until 2024-07-11.

Code
## First-time reviewers per year, labelled with their share of all reviewers
## active that year (frac_new_reviewers).
ggplot(summarydf_year,
       aes(x = pubyear, y = nbr_new_reviewers)) + 
    scale_fill_manual(name = "Year", values = yearcols) + 
    geom_col(aes(fill = pubyear)) +
    geom_text(aes(label = frac_new_reviewers), vjust = -0.2) + 
    ## Extra headroom (10%) at the top so the percentage labels are not
    ## clipped
    scale_y_continuous(expand = expansion(mult = c(0, .1))) + 
    labs(x = "", y = "Number of first-time reviewers and\npercentage of total number of reviewers") + 
    theme_cowplot() + 
    theme(legend.position = "none")

Number of reviewers per submission

Here we illustrate the distribution of the number of reviewers assigned to each submission, over time. We exclude submissions that have already been reviewed at rOpenSci or pyOpenSci, since they are not re-reviewed at JOSS.

Code
## Distribution of the number of reviewers per submission, colored by year.
## Submissions already reviewed at rOpenSci/pyOpenSci are dropped since they
## are not re-reviewed at JOSS.
nrev <- papers |>
    dplyr::filter(!grepl("rOpenSci|pyOpenSci", prerev_labels)) |>
    dplyr::select(pubyear, title, nbr_reviewers, doi)
(gg_nrevpersub <- ggplot(
    nrev, aes(x = nbr_reviewers, 
              ## Reverse the factor level order of the years (affects the
              ## stacking order of the bars)
              fill = forcats::fct_relevel(pubyear, 
                                          rev(levels(pubyear))))) + 
        geom_bar() + 
        scale_fill_manual(values = yearcols, name = "Year") + 
        scale_y_continuous(expand = c(0, 0)) + 
        ## Fixed label typo: "per submissions" -> "per submission"
        labs(x = "Number of reviewers per submission", y = "Number of submissions",
             caption = "Submissions reviewed via rOpenSci/pyOpenSci are excluded") + 
        theme_cowplot() + 
        theme(legend.position = "none"))

Since 2020, all papers are reviewed by at least two reviewers. The handful of exceptions represent two addendum papers and three cases where the editor replaced one reviewer who dropped out during the process.

Time in review

In these plots we investigate how the time a submission spends in the pre-review or review stage (or their sum) has changed over time. The curve corresponds to a rolling median computed over a 120-day window.

Code
## Helper functions (modified from https://stackoverflow.com/questions/65147186/geom-smooth-with-median-instead-of-mean)
## Rolling-median smoother usable via geom_smooth(method = "rolling_median").
## (Modified from https://stackoverflow.com/questions/65147186/geom-smooth-with-median-instead-of-mean)
##
## `formula` and `...` are accepted (but unused) so the signature matches what
## geom_smooth() passes to a fitting function. For every distinct non-NA x,
## the smoothed value is the median of all y whose x lies strictly within a
## window of width `xwindow` centered on that x. Returns a "rollmed" object
## holding the sorted x, the smoothed y, and an interpolator used by
## predict.rollmed().
rolling_median <- function(formula, data, xwindow = 120, ...) {
    ## Work on the points sorted by x
    idx <- order(data$x)
    xv <- data$x[idx]
    yv <- data$y[idx]
    
    half <- xwindow / 2
    smoothed <- rep(NA, length(xv))
    for (center in setdiff(unique(xv), NA)) {
        in_window <- (center - half < xv) & (xv < center + half)
        smoothed[xv == center] <- median(yv[in_window], na.rm = TRUE)
    }
    
    structure(list(x = xv, y = smoothed, f = approxfun(xv, smoothed)),
              class = "rollmed")
}

## predict() method for the "rollmed" objects returned by rolling_median();
## called by geom_smooth() to evaluate the smoother on a grid of x values.
## The first argument is named `object` (was `mod`) to match the
## stats::predict() generic, as required for S3 method consistency.
predict.rollmed <- function(object, newdata, ...) {
    ## Interpolate the rolling medians at the requested x positions,
    ## naming the result by those positions
    setNames(object$f(newdata$x), newdata$x)
}
Code
## One-column summary table with the median and mean number of days spent in
## pre-review, in review, and in both stages combined. check.names = FALSE
## keeps the human-readable column names; pivoting to long format turns them
## into row labels, and kbl(col.names = NULL) suppresses the table header.
data.frame(`Median number of days in pre-review` = 
               round(median(papers$days_in_pre, na.rm = TRUE), 1),
           `Mean number of days in pre-review` = 
               round(mean(papers$days_in_pre, na.rm = TRUE), 1),
           `Median number of days in review` = 
               round(median(papers$days_in_rev, na.rm = TRUE), 1),
           `Mean number of days in review` = 
               round(mean(papers$days_in_rev, na.rm = TRUE), 1),
           `Median number of days in pre-review + review` = 
               round(median(papers$days_in_pre + 
                                papers$days_in_rev, na.rm = TRUE), 1),
           `Mean number of days in pre-review + review` = 
               round(mean(papers$days_in_pre + 
                              papers$days_in_rev, na.rm = TRUE), 1),
           check.names = FALSE
           ) |>
    tidyr::pivot_longer(everything()) |>
    kableExtra::kbl(col.names = NULL) |>
    kableExtra::kable_styling()
Median number of days in pre-review 20.0 days
Mean number of days in pre-review 31.8 days
Median number of days in review 68.0 days
Mean number of days in review 93.0 days
Median number of days in pre-review + review 100.0 days
Mean number of days in pre-review + review 125.6 days
Code
## Build the multi-line annotation with the median time spent in each stage,
## used in the time-in-review figure below.
med_pre <- round(median(papers$days_in_pre, na.rm = TRUE), 1)
med_rev <- round(median(papers$days_in_rev, na.rm = TRUE), 1)
med_both <- round(median(papers$days_in_pre + papers$days_in_rev,
                         na.rm = TRUE), 1)
textannot <- paste0("Median time in:\n",
                    "Pre-review: ", med_pre, " days\n",
                    "Review: ", med_rev, " days\n",
                    "Pre-review + review: ", med_both, " days")
Code
## Total time (pre-review + review) per submission against the date the
## pre-review was opened. geom_smooth(method = "rolling_median") resolves the
## rolling_median() helper defined above by name, and overlays its 120-day
## rolling median as a black curve.
(gg_timeinrev <- ggplot(papers, aes(x = prerev_opened, 
                                    y = as.numeric(days_in_pre) + as.numeric(days_in_rev),
                                    color = pubyear)) + 
        geom_point() +
        ## Median-time summary text placed near the upper-left corner
        annotate(geom = "text", x = as.Date("2016-10-01"), y = 950, 
                 label = textannot, hjust = 0) + 
        geom_smooth(formula = y ~ x, method = "rolling_median", 
                    se = FALSE, method.args = list(xwindow = 120),
                    color = "black") +
        scale_color_manual(values = yearcols, name = "Year") + 
        labs(x = "Date of pre-review opening", y = "Number of days in\npre-review + review") + 
        theme_cowplot() + 
        theme(legend.position = "none"))

Number of comments per review issue

Here, we count the number of comments made in the review GitHub issues for each submission. We remove comments made by the editorial bot (user name @whedon or @editorialbot). Note that issues opened in the software repositories themselves, or comments therein, are not counted.

Code
## Count comments in each submission's review issue on GitHub, caching the
## results in review_issue_nbr_comments.rds so the API is only queried for
## papers that have not been processed yet.
ncomments <- readRDS("review_issue_nbr_comments.rds")
ncomments <- papers |>
    select(alternative.id, review_issue_id) |>
    left_join(ncomments, 
              by = join_by(alternative.id, review_issue_id))
## Split into already-counted submissions and those still to be queried
ncomments_done <- ncomments |> 
    filter(!is.na(nbr_comments_nobot))
ncomments_todo <- ncomments |>
    filter(is.na(nbr_comments_nobot))

if (nrow(ncomments_todo) > 0) {
    ## Based on code from https://github.com/jennybc/analyze-github-stuff-with-r
    ## NOTE(review): this chunk uses magrittr %>% while the rest of the file
    ## uses the native |> pipe. The slice(1:25) cap presumably limits GitHub
    ## API usage per render -- confirm.
    ncomments <- ncomments_todo %>%
        dplyr::slice(1:25) %>%
        ## Fetch all comments for each review issue (paginated via .limit)
        mutate(res = review_issue_id %>% map(
            ~ gh(number = .x,
                 endpoint = "/repos/openjournals/joss-reviews/issues/:number/comments",
                 .limit = Inf))) %>%
        ## Keep only the login of each comment's author
        mutate(who = res %>% map(. %>% map_chr(c("user", "login")))) %>%
        select(-res) %>%
        ## Comment totals, with and without the editorial bot accounts
        ## (@whedon, @editorialbot)
        mutate(nbr_comments = lengths(who),
               nbr_comments_nobot = vapply(
                   who, function(x) length(x[!x %in% c("whedon", "editorialbot")]), NA_integer_))
    ## Merge the newly fetched counts back into the cache file
    saveRDS(bind_rows(ncomments_done, ncomments), 
            file = "review_issue_nbr_comments.rds")
}
Code
## Re-read the (possibly just-updated) comment-count cache and draw ridgeline
## densities of non-bot comment counts, one ridge per publication year.
ncomments <- readRDS("review_issue_nbr_comments.rds")

(gg_nbrcomments <- ggplot(
    papers |>
        left_join(ncomments,
                  by = join_by(alternative.id, review_issue_id)), 
    aes(x = nbr_comments_nobot, y = pubyear)) + 
        geom_density_ridges(aes(fill = pubyear)) + 
        scale_fill_manual(values = yearcols, name = "Year") + 
        labs(x = "Number of comments (not by bot) in review issue", y = "") + 
        theme_cowplot() + 
        theme(legend.position = "none"))
Picking joint bandwidth of 4.36

Citation statistics

Finally, we calculate some statistics related to the citation of JOSS papers. This information has been retrieved from OpenAlex, using the openalexR R package.

All papers

Code
## Citation statistics across all (non-retracted) JOSS papers
allcites <- papers$citation_count
cat("Number of papers: ", length(allcites), "\n")
cat("Number of citations: ", sum(allcites, na.rm = TRUE), "\n")
cat("Summary statistics: \n")
summary(allcites)
Number of papers:  2545 
Number of citations:  65882 
Summary statistics: 
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
    0.00     0.00     2.00    25.97     9.00 10844.00        8 

The most cited paper:

Code
## Identify the single most-cited paper (which.max ignores NA citation
## counts) and print a short formatted citation line for it.
maxcit <- which.max(papers$citation_count)
cat(paste0(papers$author[maxcit][[1]]$family[1], " et al (", 
           papers$pubyear[maxcit], "): ", papers$title[maxcit], 
           " (", papers$citation_count[maxcit], " citations)"))
Wickham et al (2019): Welcome to the Tidyverse (10844 citations)

Papers published in 2016-2023

Code
## Citation statistics restricted to the complete publication years
## 2016-2023 (2024 is only partially covered by the data).
## pubyear is a factor, so %in% matches on the character representation.
cites <- papers$citation_count[papers$pubyear %in% as.character(2016:2023)]
cat("Number of papers: ", length(cites), "\n")
cat("Number of citations: ", sum(cites, na.rm = TRUE), "\n")
cat("Summary statistics: \n")
summary(cites)
Number of papers:  2264 
Number of citations:  65804 
Summary statistics: 
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
    0.00     1.00     3.00    29.07    10.00 10844.00 

Compare to SoftwareX papers published in 2016-2023

Code
softwarex <- oa_fetch(entity = "works", 
                      primary_location.source.id = "s2506067282") |>
    dplyr::mutate(pubyear = year(as.Date(publication_date)))
tmp <- softwarex$cited_by_count[softwarex$pubyear %in% c(2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023)]
cat("Number of papers: ", length(tmp), "\n")
cat("Number of citations: ", sum(tmp, na.rm = TRUE), "\n")
cat("Summary statistics: \n")
summary(tmp)
Number of papers:  1174 
Number of citations:  11933 
Summary statistics: 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    1.00    3.00   10.16    9.75  431.00 

Put together final figures

Code
## Overwrite the get_legend function from cowplot temporarily, 
## since the cowplot one doesn't work with ggplot2 3.5.0
## see https://github.com/wilkelab/cowplot/issues/202

## Extract the first non-empty legend grob from a ggplot.
## Temporary replacement for cowplot::get_legend(), which does not work with
## ggplot2 3.5.0; see https://github.com/wilkelab/cowplot/issues/202
##
## `plot` is a ggplot object; `legend` optionally names a guide position
## (e.g. "bottom") to restrict the search. Returns a grob, or NULL when no
## non-empty legend is found.
get_legend <- function(plot, legend = NULL) {
    gtab <- ggplotGrob(plot)
    
    ## Match either all guide boxes, or only the requested position
    pattern <- if (is.null(legend)) "guide-box" else paste0("guide-box-", legend)
    hits <- grep(pattern, gtab$layout$name)
    
    ## Drop placeholder zeroGrob entries (empty guide slots)
    is_real <- !vapply(
        gtab$grobs[hits],
        inherits, what = "zeroGrob",
        FUN.VALUE = logical(1)
    )
    hits <- hits[is_real]
    
    if (length(hits) == 0) {
        return(NULL)
    }
    gtab$grobs[[hits[1]]]
}

Figure 1

Code
## Figure 1: papers per month (A) and editors per year (B), combined with
## patchwork. Panel legends are suppressed via the & theme(); the shared year
## legend is extracted from gg_nbrpapers and stacked underneath with
## plot_grid().
gg1 <- gg_nbrpapers + gg_nbreditors + 
    plot_annotation(tag_levels = "A", 
                    caption = paste0("Includes data until ", max(papers$published.date))) + 
    plot_layout(ncol = 2, guides = "collect") & 
    theme(legend.position = "none",
          plot.caption = element_text(size = 13))
gg2 <- get_legend(gg_nbrpapers)
plot_grid(gg1, gg2, ncol = 1, rel_heights = c(3.3, 0.7))

Figure 3

Code
## Figure 3: reviewers per submission (A) and reviewers/reviews per year (B).
## The legend is taken from gg_nbrpapers -- all panels use the same yearcols
## palette, so its year legend applies here as well.
gg1 <- gg_nrevpersub + gg_nbrreviewers + 
    plot_annotation(tag_levels = "A", 
                    caption = paste0("Includes data until ", max(papers$published.date))) + 
    plot_layout(ncol = 2, guides = "collect") & 
    theme(legend.position = "none",
          plot.caption = element_text(size = 13))
gg2 <- get_legend(gg_nbrpapers)
plot_grid(gg1, gg2, ncol = 1, rel_heights = c(3.3, 0.7))

Figure 4

Code
## Figure 4: comments per review issue (A) and time in review (B), again with
## the common year legend taken from gg_nbrpapers.
gg1 <- gg_nbrcomments + gg_timeinrev + 
    plot_annotation(tag_levels = "A", 
                    caption = paste0("Includes data until ", max(papers$published.date))) + 
    plot_layout(ncol = 2, guides = "collect") & 
    theme(legend.position = "none",
          plot.caption = element_text(size = 13))
gg2 <- get_legend(gg_nbrpapers)
plot_grid(gg1, gg2, ncol = 1, rel_heights = c(3.3, 0.7))

Session info

Session info
R version 4.4.0 (2024-04-24)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.5

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Europe/Zurich
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] openalexR_1.4.0  gh_1.4.1         purrr_1.0.2      patchwork_1.2.0 
 [5] kableExtra_1.4.0 lubridate_1.9.3  ggridges_0.5.6   cowplot_1.1.3   
 [9] tidyr_1.3.1      dplyr_1.1.4      ggplot2_3.5.1   

loaded via a namespace (and not attached):
 [1] sass_0.4.9        utf8_1.2.4        generics_0.1.3    xml2_1.3.6       
 [5] lattice_0.22-6    stringi_1.8.4     digest_0.6.36     magrittr_2.0.3   
 [9] evaluate_0.24.0   grid_4.4.0        timechange_0.3.0  fastmap_1.2.0    
[13] Matrix_1.7-0      jsonlite_1.8.8    httr_1.4.7        mgcv_1.9-1       
[17] fansi_1.0.6       crosstalk_1.2.1   viridisLite_0.4.2 scales_1.3.0     
[21] jquerylib_0.1.4   cli_3.6.3         rlang_1.1.4       splines_4.4.0    
[25] munsell_0.5.1     cachem_1.1.0      withr_3.0.0       yaml_2.3.9       
[29] tools_4.4.0       colorspace_2.1-0  DT_0.33           forcats_1.0.0    
[33] curl_5.2.1        vctrs_0.6.5       R6_2.5.1          lifecycle_1.0.4  
[37] stringr_1.5.1     htmlwidgets_1.6.4 pkgconfig_2.0.3   pillar_1.9.0     
[41] bslib_0.7.0       gtable_0.3.5      glue_1.7.0        systemfonts_1.1.0
[45] highr_0.11        xfun_0.45         tibble_3.2.1      tidyselect_1.2.1 
[49] rstudioapi_0.16.0 knitr_1.48        farver_2.1.2      nlme_3.1-164     
[53] htmltools_0.5.8.1 labeling_0.4.3    rmarkdown_2.27    svglite_2.1.3    
[57] compiler_4.4.0